Libraries

library(rtracklayer)
library(GenomicFeatures)
library(txdbmaker) 
library(Biostrings)
library(BSgenome)
library(BSgenomeForge)
library(dplyr)
library(GenomicRanges)
library(GenomeInfoDb) 
library(Biostrings)
library(BSgenome)
library(ggplot2)
library(scales)
library(ggpointdensity)

Loading data - Loading seqs + filtering

library(BSgenome.Dlaeve.NCBI.dlgm)
genome <- BSgenome.Dlaeve.NCBI.dlgm
print(class(genome))

[1] “BSgenome”

attr(,“package”)

[1] “BSgenome”

length(genome)

[1] 3497

genome

| BSgenome object

| - organism: Deroceras laeve

| - provider: NCBI

| - genome: dl_gm

| - 3497 sequence(s):

| chr1 chr2 chr3 chr4

| chr5 chr6 chr7 chr8

| chr9 chr10 chr11 chr12

| chr13 chr14 chr15 chr16

| chr17 chr18 chr19 chr20

| … … … …

| HiC_scaffold_3481 HiC_scaffold_3482 HiC_scaffold_3483 HiC_scaffold_3484

| HiC_scaffold_3485 HiC_scaffold_3486 HiC_scaffold_3487 HiC_scaffold_3488

| HiC_scaffold_3489 HiC_scaffold_3490 HiC_scaffold_3491 HiC_scaffold_3492

| HiC_scaffold_3493 HiC_scaffold_3494 HiC_scaffold_3495 HiC_scaffold_3496

| HiC_scaffold_3497

|

| Tips: call ‘seqnames()’ on the object to get all the sequence names, call ‘seqinfo()’ to

| get the full sequence info, use the ‘$’ or ‘[[’ operator to access a given sequence, see

| ‘?BSgenome’ for more information.

# names present in your BSgenome
nm  <- seqnames(genome)

# choose only proper chromosomes (e.g. chr1..chr31), ignore the HiC_scaffold zoo
keep <- intersect(paste0("chr", 1:31), nm)          # safe even if some are missing

# grab them quickly as a DNAStringSet (lightweight, fast)
genome_chr <- getSeq(genome, keep)

# sanity check
length(genome_chr); head(names(genome_chr))

[1] 31## [1] “chr1” “chr2” “chr3” “chr4” “chr5” “chr6”

transposons <- import("C:/Users/rafae/OneDrive/Desktop/Genomics/Data2/Data3/Data4/HiTE_intact.CORRECTED.gff3")
transposons

GRanges object with 144324 ranges and 16 metadata columns:

seqnames ranges strand | source type

|

[1] chr1 319453-324433 + | HiTE TIR

[2] chr1 384708-385517 - | HiTE TIR

[3] chr1 388818-389375 + | HiTE TIR

[4] chr1 419867-421331 - | HiTE TIR

[5] chr1 501967-503074 - | HiTE TIR

… … … … . … …

[144320] HiC_scaffold_99 47879-54432 + | HiTE LTR

[144321] HiC_scaffold_99 47879-48671 + | HiTE long_terminal_repeat

[144322] HiC_scaffold_99 53640-54432 + | HiTE long_terminal_repeat

[144323] HiC_scaffold_99 54433-54436 + | HiTE target_site_duplication

[144324] HiC_scaffold_995 56985-57096 + | HiTE TIR

score phase id name classification

[1] NA te_intact_3587 TIR_939 LINE/I

[2] NA te_intact_96238 TIR_124 DNA/hAT

[3] NA te_intact_29975 TIR_335 LINE/RTE-RTE

[4] NA te_intact_98614 TIR_480 LINE/L1

[5] NA te_intact_87947 TIR_411 DNA/MULE

… … … … … …

[144320] NA LTRRT_861 HiC_scaffold_99:4787.. LTR/Gypsy

[144321] NA lLTR_861 HiC_scaffold_99:4787.. LTR/Gypsy

[144322] NA rLTR_861 HiC_scaffold_99:4787.. LTR/Gypsy

… [30 líneas truncadas]

# names you want, taken from your FASTA grab (chr1..chr31)
wanted <- seqlevels(genome_chr)

# if styles differ, normalize once (e.g., add/remove "chr")
seqlevelsStyle(transposons) <- seqlevelsStyle(genome_chr)

# keep only those seqlevels and prune ranges on others
transposons_chr <- keepSeqlevels(transposons, wanted, pruning.mode = "coarse")

# order levels identically to genome, then copy seqinfo
seqlevels(transposons_chr) <- wanted
seqinfo(transposons_chr)  <- seqinfo(genome_chr)[wanted]

# sanity:
seqlevels(transposons_chr)   # should be chr1..chr31 only

[1] “chr1” “chr2” “chr3” “chr4” “chr5” “chr6” “chr7” “chr8” “chr9” “chr10”

[11] “chr11” “chr12” “chr13” “chr14” “chr15” “chr16” “chr17” “chr18” “chr19” “chr20”

[21] “chr21” “chr22” “chr23” “chr24” “chr25” “chr26” “chr27” “chr28” “chr29” “chr30”

[31] “chr31”

geneinfo <- mcols(transposons_chr)[c("id","source")]
geneinfo

DataFrame with 142603 rows and 2 columns

id source

1 te_intact_3587 HiTE

2 te_intact_96238 HiTE

3 te_intact_29975 HiTE

4 te_intact_98614 HiTE

5 te_intact_87947 HiTE

… … …

142599 te_intact_123779 HiTE

142600 te_intact_79331 HiTE

142601 te_intact_2287 HiTE

142602 te_intact_1168 HiTE

142603 te_intact_104684 HiTE

seqinfo(transposons_chr) <- seqinfo(genome_chr)
transposons_chr <- trim(transposons_chr)
transposons_seqs <- getSeq(genome_chr, transposons_chr)

Body

length(transposons_seqs)

[1] 142603

transposons_seqs

DNAStringSet object of length 142603:

width seq

[1] 4981 GGGGGTGGCGAAGCACCTCTGCTCCGTACGGAGCGA…ATAGTCTGAAGTAGCGAACGCTGAAGAAGGAGAAGA

[2] 810 CAGTGGCGTCTCGTGACCTAAAATTTAACAGGGGCT…TAAGTCCCGATTGCCCCAATAGACGAGACTCCACTG

[3] 558 TGCCAAAGGCGGCACATAGGGCAGAGGTTTTACATC…TGTGTTTATTTCCTTCATTGTGTCTCCGTCTTTTGA

[4] 1465 GGCAAAAGCTGCCGACGTCTGGGTCAGCGACTCAAA…ATGAAAATGTCAAATTTCAATCACTGAAGAATAAGA

[5] 1108 TTTATTTATTTATTTAGTCAAACGAGCCAACATCAC…GAGGACTGCATCATCCATGACCGGAAACCGGAAGTC

… … …

[142599] 327 GGTGATTGGTATGAAAATACGGATGTAAAATAATAG…CTCGAAGGATAGCTTCCAAAAAGACTGCTTAAAGAG

[142600] 328 TTTATTTATTTATTTATTTTTTCTCTTTTTTATTTT…GTCCGATGCTCTACTGACTTTGCTATCCTGGCCGCC

[142601] 492 CCGGTCGTTGTTTTACTCTCGCCCACAGATCCTCGT…TGCTGTCTTCTCTTGCATTTGTGCCTAGCTATGGAA

[142602] 659 ATACAAATAACAGAGAGCAAACAACACTGGGAAACA…ACCTGTGCCTCCAGGTGGCCAAAAGGTGAAGAAGAA

[142603] 1550 CTTCTTCTTCTTCCGCGTTCGAAAATTAACATTTTC…GCCAACCTCTCCCATAAGAGGCGCTTTGTCCCTCCC

Counting

letterFrequency_all <- letterFrequency(transposons_seqs, c("A", "T", "C", "G")) %>% as.data.frame %>% tibble()
letterFrequency_all
C_all <- letterFrequency_all[3]
G_all <- letterFrequency_all[4]
Width_all <- width(transposons_seqs)
CG_all <- dinucleotideFrequency(transposons_seqs)[,"CG"]
OE_all = (CG_all*Width_all)/(C_all*G_all)
colnames(OE_all) <- c("OEindex")
OE_all
cgContent_all = (C_all+G_all)/Width_all
colnames(cgContent_all) <- c("CGpercentage")
cgContent_all
df <- cbind(OE_all,cgContent_all)
df

Graphs

x <- OE_all$OEindex
x <- x[is.finite(x)]
bw <- 2 * IQR(x) / length(x)^(1/3)
if (!is.finite(bw) || bw <= 0) bw <- diff(range(x)) / 80
xmin <- floor(min(x) * 10) / 10
xmax <- ceiling(max(x) * 10) / 10

ggplot(OE_all, aes(x = OEindex)) +
  geom_histogram(aes(y = after_stat(density)),
                 binwidth = bw, boundary = 0, closed = "left",
                 fill = "#4C7BA8", color = "white", alpha = 0.95) +
  geom_density(linewidth = 0.9, color = "black", alpha = 0.6) +
  scale_x_continuous(
    breaks = seq(xmin, xmax, by = 0.1),
    minor_breaks = seq(xmin, xmax, by = 0.05),
    labels = number_format(accuracy = 0.1),
    expand = expansion(mult = c(0.01, 0.02))
  ) +
  labs(
    title = "O/E CpG Ratio Distribution — Intact transposons (Body)",
    x = "O/E index",
    y = "Density"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    panel.grid.minor = element_line(),
    plot.title = element_text(face = "bold")
  )

x <- cgContent_all$CGpercentage
x <- x[is.finite(x)]
bw <- 2 * IQR(x) / length(x)^(1/3)
if (!is.finite(bw) || bw <= 0) bw <- diff(range(x)) / 80
xmin <- floor(min(x) * 10) / 10
xmax <- ceiling(max(x) * 10) / 10

ggplot(cgContent_all, aes(x = CGpercentage)) +
  geom_histogram(aes(y = after_stat(density)),
                 binwidth = bw, boundary = 0, closed = "left",
                 fill = "#4C7BA8", color = "white", alpha = 0.95) +
  geom_density(linewidth = 0.9, color = "black", alpha = 0.6) +
  scale_x_continuous(
    breaks = seq(xmin, xmax, by = 0.1),
    minor_breaks = seq(xmin, xmax, by = 0.05),
    labels = number_format(accuracy = 0.1),
    expand = expansion(mult = c(0.01, 0.02))
  ) +
  labs(
    title = "CG% — Intact transposons (Body)",
    x = "CG%",
    y = "Density"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    panel.grid.minor = element_line(),
    plot.title = element_text(face = "bold")
  )

plot_nice_dark <- function(d, x="OEindex", y="CGpercentage",
                           xthr=.65, ythr=.55, title="Scatter - Intact transposons - Body",
                           point_col="#10b981", thr_col="#7f1d1d"){   # green, violet
  dd <- data.frame(x=as.numeric(d[[x]]), y=as.numeric(d[[y]]))
  dd <- dd[is.finite(dd$x) & is.finite(dd$y), ]
  ggplot(dd, aes(x, y)) +
    geom_point(color=point_col, alpha=.75, size=1.6,
               position=position_jitter(width=.004, height=0)) +
    geom_vline(xintercept=xthr, color=thr_col, linewidth=.6, linetype="dotted") +
    geom_hline(yintercept=ythr, color=thr_col, linewidth=.6, linetype="dotted") +
    labs(title=title, x="O/E index", y="CG (%)") +
    theme_minimal(base_size=12) +
    theme(
      plot.background  = element_rect(fill="black", color=NA),
      panel.background = element_rect(fill="black", color=NA),
      panel.grid       = element_line(color="#242424"),
      axis.text        = element_text(color="white"),
      axis.title       = element_text(color="white"),
      plot.title       = element_text(color="white", face="bold")
    )
}

# use:
plot_nice_dark(df)

PCA

# 1) Clean: keep numeric cols, drop NA/Inf rows, drop zero-variance cols
X <- df[, sapply(df, is.numeric), drop = FALSE]
X <- X[Reduce(`&`, lapply(X, is.finite)), , drop = FALSE]
X <- X[, sapply(X, function(z) sd(z) > 0), drop = FALSE]

# 2) PCA
pca <- prcomp(X, center = TRUE, scale. = TRUE)
pc  <- as.data.frame(pca$x[, 1:2, drop = FALSE])

ggplot(pc, aes(PC1, PC2)) +
  geom_point(alpha = .7, size = 1.6, color = "#10b981") +
  theme_minimal(base_size = 13) +
  labs(title = "PCA of body", x = "PC1", y = "PC2")

summary(pca)

Importance of components:

PC1 PC2

Standard deviation 1.0317 0.9672

Proportion of Variance 0.5322 0.4678

Cumulative Proportion 0.5322 1.0000

cor(df$OEindex, df$CGpercentage)

[1] NA

K-Means

# clean once; keep the mask so we can put labels back correctly
X <- df[, c("OEindex","CGpercentage")]
m <- is.finite(X$OEindex) & is.finite(X$CGpercentage)
Xc <- scale(X[m, , drop=FALSE])

set.seed(42)
k  <- 7
km <- kmeans(Xc, centers=k, nstart=50)

df$cluster <- NA_integer_
df$cluster[m] <- km$cluster
df$cluster <- factor(df$cluster)

ggplot(df[!is.na(df$cluster),], aes(OEindex, CGpercentage, color=cluster)) +
  geom_point(alpha=.75, size=1.7) +
  theme_minimal(base_size=13) +
  labs(title=sprintf("k-means (k=%d) Body", k),
       x="O/E index", y="CG (%)")

#Add names
df <- cbind(geneinfo, df)
df

DataFrame with 142603 rows and 5 columns

id source OEindex CGpercentage cluster

1 te_intact_3587 HiTE 0.750453 0.505722 5

2 te_intact_96238 HiTE 0.965330 0.335802 1

3 te_intact_29975 HiTE 0.794589 0.379928 1

4 te_intact_98614 HiTE 0.753159 0.509215 5

5 te_intact_87947 HiTE 1.076524 0.466606 3

… … … … … …

142599 te_intact_123779 HiTE 0.632146 0.373089 1

142600 te_intact_79331 HiTE 0.753551 0.423780 6

142601 te_intact_2287 HiTE 0.475133 0.414634 4

142602 te_intact_1168 HiTE 0.396988 0.433991 4

142603 te_intact_104684 HiTE 0.850521 0.512258 5

#Body list with cgi arquitecture
df_filtered_cgi_body <- dplyr::filter(as.data.frame(df), OEindex >= 0.65, CGpercentage >= .55)
df_filtered_cgi_body
C_all_sum <- sum(C_all)
G_all_sum <- sum(G_all)
Width_all_sum <- sum(width(transposons_seqs))
CG_all_sum <- sum(CG_all)
C_all_sum <- as.numeric(C_all_sum)
G_all_sum  <- as.numeric(G_all_sum)
Width_all_sum <- as.numeric(Width_all_sum)
CG_all_sum <- as.numeric(CG_all_sum)
#CG% sum - Body
cgContent_all_sum <- (C_all_sum+G_all_sum)/Width_all_sum
cgContent_all_sum

[1] 0.4334743

#OE Index sum - Body
OE_all_sum <- (CG_all_sum*Width_all_sum)/(C_all_sum*G_all_sum)
OE_all_sum

[1] 0.7560854

#Body sums totals
C_all_sum

[1] 24643483

G_all_sum 

[1] 24916540

Width_all_sum

[1] 114332084

CG_all_sum 

[1] 4060622

cgContent_all_sum

[1] 0.4334743

OE_all_sum

[1] 0.7560854

#Porcentaje/Ratio de GCs en el cuerpo segun el span del cuerpo
CG_all_sum/Width_all_sum

[1] 0.03551603

#Porcentaje/Ratio de GC del cuerpo en comparacion al genoma
genomeCG_totalcount = 45535149
CG_all_sum/45535149

[1] 0.08917555

PROMOTORES

promotores_500bp <- promoters(transposons_chr, upstream = 2000, downstream = 0)


seqlevelsStyle(promotores_500bp) <- seqlevelsStyle(genome)
promotores_500bp <- keepSeqlevels(
  promotores_500bp,
  intersect(seqlevels(promotores_500bp), seqlevels(genome)),
  pruning.mode = "coarse"
)
seqinfo(promotores_500bp) <- seqinfo(genome)[seqlevels(promotores_500bp)]
promotores_500bp <- trim(promotores_500bp)

promotores_500bp <- promotores_500bp[width(promotores_500bp) > 0]

starts <- start(promotores_500bp)
ends   <- end(promotores_500bp)
lens   <- seqlengths(promotores_500bp)[as.character(seqnames(promotores_500bp))]
start(promotores_500bp) <- pmax(1L, starts)
end(promotores_500bp)   <- pmin(ends, lens)


seqs_promotores_500bp <- getSeq(genome_chr, promotores_500bp)


length(seqs_promotores_500bp)

[1] 142603

seqs_promotores_500bp

DNAStringSet object of length 142603:

width seq

[1] 2000 TCAGTTTAGTGCAGCCAGCCACGGGGTCTATGATGG…GTGGAACATCTGCGGCCTAGGAAATAAAACACCATA

[2] 2000 AAACAGATATACACTTGCTGTTGATGATTTTTCTTT…AAAGAGACTGCGATAATCGGTAATCATTCAGTTCAT

[3] 2000 ACAATGGACAGTGACATAGAAATGAATGTTATTTGT…TCTTGCGCATCACGCATCACTGAAACTTAGTTAGTT

[4] 2000 TCTCTCAACAAAAGACCAGCCCCACACTTCCTCAAT…AAGCAATTTCAATCAGCGCGGGAAGGAGAAGTGGCT

[5] 2000 AGCTGCCCCTGCTCGCGCCCCCTCAACAACAGAGAC…TACAGCAGCCAGTCAGCAGTCAGCGGTGCTTACATT

… … …

[142599] 2000 TAGAGGACTTGAACCCATGCATGTGGTCACGGGTCA…TGACCTGATGTGTCTGATCGAGGCAAGATTAGCTAA

[142600] 2000 GCGGGCCACTGCGACATCCAAGGCAACGAGAGGGCG…TATATTACTCTGCATGACATTTTCTCATATGTCTTC

[142601] 2000 AATTTACAAAGGCCTACTATCTAATCTCTTGAAACA…GTCATCTCCACCTTCTATATCTCAGTTCTTGTTCCA

[142602] 2000 AACAGTGGTGGTGGCTTTAGTTAGTTAGTCTTGCCT…AAATACATGGAGACAAAACAAAGACTGTGAGACTCA

[142603] 2000 CGGACGTGAAAAAGATGAAACATAAATAGACTCGCC…GCTTCTCACCGCTTTGTTAATTCTTTACTTCAGTTA

Counting

letterFrequency_all_promoters <- letterFrequency(seqs_promotores_500bp, c("A", "T", "C", "G")) %>% as.data.frame %>% tibble()
letterFrequency_all_promoters
C_all_promoters <- letterFrequency_all_promoters[3]
G_all_promoters <- letterFrequency_all_promoters[4]
Width_all_promoters <- width(seqs_promotores_500bp)
CG_all_promoters <- dinucleotideFrequency(seqs_promotores_500bp)[,"CG"]
C_all_promoters
G_all_promoters
Width_all_promoters 

[1] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[17] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[33] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[49] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[65] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[81] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[97] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[113] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[129] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[145] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[161] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[177] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[193] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[209] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[225] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[241] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[257] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[273] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[289] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[305] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[321] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[337] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[353] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[369] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

[385] 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000

… [6226 líneas truncadas]

CG_all_promoters

[1] 53 37 55 41 97 209 173 15 16 12 77 60 94 31 7 96 56 27 70 59

[21] 60 62 27 22 35 107 38 14 15 60 52 60 45 64 28 53 43 29 29 92

[41] 34 66 81 62 33 10 62 43 85 64 62 22 47 55 29 30 29 26 70 70

[61] 57 65 67 58 84 51 41 59 58 65 78 60 121 70 56 51 26 69 103 105

[81] 107 102 49 90 77 42 120 89 55 44 29 29 29 29 78 60 44 22 27 93

[101] 21 74 26 20 58 39 16 37 44 28 5 98 93 36 23 13 44 85 95 74

[121] 74 73 73 27 39 113 12 81 66 68 68 76 46 42 17 15 20 17 25 50

[141] 75 70 45 18 25 34 82 57 83 24 25 25 1 43 61 19 23 17 72 33

[161] 25 2 14 31 53 20 59 29 25 71 205 210 203 210 210 216 45 83 55 54

[181] 57 61 76 68 57 110 48 62 100 54 40 54 40 56 45 98 102 60 66 64

[201] 50 24 29 24 66 66 72 38 43 63 46 59 70 75 73 53 72 34 50 70

[221] 51 57 51 52 69 25 15 11 66 28 47 95 50 87 85 83 75 78 51 21

[241] 35 59 81 61 59 51 63 57 90 67 72 70 71 50 77 57 23 34 51 71

[261] 59 66 48 66 29 26 54 56 54 13 22 60 20 75 76 74 74 74 74 69

[281] 80 75 60 63 51 51 92 50 17 11 24 71 74 64 41 46 68 84 54 30

[301] 24 59 68 24 30 22 76 42 74 46 42 35 46 60 80 70 22 32 76 48

[321] 55 31 27 58 65 31 50 63 51 18 38 48 63 89 53 60 32 59 66 59

[341] 52 16 24 27 59 18 28 28 85 56 55 44 44 69 62 59 68 66 40 56

[361] 56 47 95 23 52 47 38 50 56 53 49 30 89 67 67 91 73 74 30 40

[381] 38 62 88 70 53 72 54 11 43 26 37 27 31 46 77 57 69 57 29 124

[401] 43 49 20 34 77 56 56 56 56 61 58 46 73 82 49 14 57 83 69 60

[421] 54 44 52 52 55 54 105 105 106 106 30 43 79 55 43 55 64 77 79 62

[441] 59 71 22 49 46 49 32 44 46 67 30 65 81 16 40 71 47 37 19 65

[461] 59 48 76 86 95 76 59 52 60 60 51 46 53 48 44 70 63 53 67 43

[481] 51 56 57 78 79 50 64 34 84 35 47 51 34 74 92 30 76 54 50 62

… [4976 líneas truncadas]

OE_all_promoters = (CG_all_promoters*Width_all_promoters)/(C_all_promoters*G_all_promoters)
colnames(OE_all_promoters) <- c("OEindex")
OE_all_promoters 
cgContent_all_promoters = (C_all_promoters+G_all_promoters)/Width_all_promoters
colnames(cgContent_all_promoters) <- c("CGpercentage")
cgContent_all_promoters
df_promoters <- cbind(OE_all_promoters,cgContent_all_promoters)
df_promoters

Graphs

x <- OE_all_promoters$OEindex
x <- x[is.finite(x)]
bw <- 2 * IQR(x) / length(x)^(1/3)
if (!is.finite(bw) || bw <= 0) bw <- diff(range(x)) / 80
xmin <- floor(min(x) * 10) / 10
xmax <- ceiling(max(x) * 10) / 10

ggplot(OE_all_promoters, aes(x = OEindex)) +
  geom_histogram(aes(y = after_stat(density)),
                 binwidth = bw, boundary = 0, closed = "left",
                 fill = "#4C7BA8", color = "white", alpha = 0.95) +
  geom_density(linewidth = 0.9, color = "black", alpha = 0.6) +
  scale_x_continuous(
    breaks = seq(xmin, xmax, by = 0.1),
    minor_breaks = seq(xmin, xmax, by = 0.05),
    labels = number_format(accuracy = 0.1),
    expand = expansion(mult = c(0.01, 0.02))
  ) +
  labs(
    title = "O/E CpG Ratio Distribution — Intact transposons (Promoters)",
    x = "O/E index",
    y = "Density"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    panel.grid.minor = element_line(),
    plot.title = element_text(face = "bold")
  )

x <- cgContent_all_promoters$CGpercentage
x <- x[is.finite(x)]
bw <- 2 * IQR(x) / length(x)^(1/3)
if (!is.finite(bw) || bw <= 0) bw <- diff(range(x)) / 80
xmin <- floor(min(x) * 10) / 10
xmax <- ceiling(max(x) * 10) / 10

ggplot(cgContent_all_promoters, aes(x = CGpercentage)) +
  geom_histogram(aes(y = after_stat(density)),
                 binwidth = bw, boundary = 0, closed = "left",
                 fill = "#4C7BA8", color = "white", alpha = 0.95) +
  geom_density(linewidth = 0.9, color = "black", alpha = 0.6) +
  scale_x_continuous(
    breaks = seq(xmin, xmax, by = 0.1),
    minor_breaks = seq(xmin, xmax, by = 0.05),
    labels = number_format(accuracy = 0.1),
    expand = expansion(mult = c(0.01, 0.02))
  ) +
  labs(
    title = "CG% — Intact transposons (Promoters)",
    x = "CG%",
    y = "Density"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    panel.grid.minor = element_line(),
    plot.title = element_text(face = "bold")
  )

plot_nice_dark <- function(d, x="OEindex", y="CGpercentage",
                           xthr=.65, ythr=.55, title="Scatter - Intact transposons - Promoters",
                           point_col="#10b981", thr_col="#7f1d1d"){   # green, violet
  dd <- data.frame(x=as.numeric(d[[x]]), y=as.numeric(d[[y]]))
  dd <- dd[is.finite(dd$x) & is.finite(dd$y), ]
  ggplot(dd, aes(x, y)) +
    geom_point(color=point_col, alpha=.75, size=1.6,
               position=position_jitter(width=.004, height=0)) +
    geom_vline(xintercept=xthr, color=thr_col, linewidth=.6, linetype="dotted") +
    geom_hline(yintercept=ythr, color=thr_col, linewidth=.6, linetype="dotted") +
    labs(title=title, x="O/E index", y="CG (%)") +
    theme_minimal(base_size=12) +
    theme(
      plot.background  = element_rect(fill="black", color=NA),
      panel.background = element_rect(fill="black", color=NA),
      panel.grid       = element_line(color="#242424"),
      axis.text        = element_text(color="white"),
      axis.title       = element_text(color="white"),
      plot.title       = element_text(color="white", face="bold")
    )
}

# use:
plot_nice_dark(df_promoters)

PCA

# 1) Clean: keep numeric cols, drop NA/Inf rows, drop zero-variance cols
X <- df_promoters[, sapply(df_promoters, is.numeric), drop = FALSE]
X <- X[Reduce(`&`, lapply(X, is.finite)), , drop = FALSE]
X <- X[, sapply(X, function(z) sd(z) > 0), drop = FALSE]

# 2) PCA
pca <- prcomp(X, center = TRUE, scale. = TRUE)
pc  <- as.data.frame(pca$x[, 1:2, drop = FALSE])

ggplot(pc, aes(PC1, PC2)) +
  geom_point(alpha = .7, size = 1.6, color = "#10b981") +
  theme_minimal(base_size = 13) +
  labs(title = "PCA Promoters", x = "PC1", y = "PC2")

summary(pca)

Importance of components:

PC1 PC2

Standard deviation 1.0570 0.9395

Proportion of Variance 0.5587 0.4413

Cumulative Proportion 0.5587 1.0000

cor(df$OEindex, df$CGpercentage)

[1] NA

K-Means

# clean once; keep the mask so we can put labels back correctly
X <- df_promoters[, c("OEindex","CGpercentage")]
m <- is.finite(X$OEindex) & is.finite(X$CGpercentage)
Xc <- scale(X[m, , drop=FALSE])

set.seed(42)
k  <- 2
km <- kmeans(Xc, centers=k, nstart=50)

df$cluster <- NA_integer_
df$cluster[m] <- km$cluster
df$cluster <- factor(df$cluster)

ggplot(df[!is.na(df$cluster),], aes(OEindex, CGpercentage, color=cluster)) +
  geom_point(alpha=.75, size=1.7) +
  theme_minimal(base_size=13) +
  labs(title=sprintf("k-means (k=%d) Promoters", k),
       x="O/E index", y="CG (%)")

#Add names
df_promoters <- cbind(geneinfo, df_promoters)
df_promoters

DataFrame with 142603 rows and 4 columns

id source OEindex CGpercentage

1 te_intact_3587 HiTE 0.335549 0.5680

2 te_intact_96238 HiTE 0.605679 0.3510

3 te_intact_29975 HiTE 0.769877 0.3780

4 te_intact_98614 HiTE 0.407563 0.4985

5 te_intact_87947 HiTE 0.662824 0.5440

… … … … …

142599 te_intact_123779 HiTE 0.547046 0.4325

142600 te_intact_79331 HiTE 0.470810 0.4250

142601 te_intact_2287 HiTE 0.524417 0.4380

142602 te_intact_1168 HiTE 0.569973 0.4155

142603 te_intact_104684 HiTE 0.898042 0.4470

#Promoters list with cgi arquitecture
df_filtered_cgi_promoters <- dplyr::filter(as.data.frame(df_promoters), OEindex >= 0.65, CGpercentage >= .55)
df_filtered_cgi_promoters

Promoters Sum Count

C_all_sum_promoters <- sum(C_all_promoters)
G_all_sum_promoters <- sum(G_all_promoters)
Width_all_sum_promoters <- sum(width(seqs_promotores_500bp))
CG_all_sum_promoters <- sum(CG_all_promoters)
C_all_sum_promoters <- as.numeric(C_all_sum_promoters)
G_all_sum_promoters  <- as.numeric(G_all_sum_promoters)
Width_all_sum_promoters <- as.numeric(Width_all_sum_promoters)
CG_all_sum_promoters <- as.numeric(CG_all_sum_promoters)
C_all_sum_promoters

[1] 57761228

G_all_sum_promoters 

[1] 57924169

Width_all_sum_promoters 

[1] 285204015

CG_all_sum_promoters 

[1] 7375038

#CG% sum - Body
cgContent_all_sum_promoters <- (C_all_sum_promoters+G_all_sum_promoters)/Width_all_sum_promoters
cgContent_all_sum_promoters

[1] 0.4056233

#OE Index sum - Body
OE_all_sum_promoters <- (CG_all_sum_promoters*Width_all_sum_promoters)/(C_all_sum_promoters*G_all_sum_promoters)
OE_all_sum_promoters

[1] 0.6286713

#Body sums totals
C_all_sum_promoters

[1] 57761228

G_all_sum_promoters 

[1] 57924169

Width_all_sum_promoters

[1] 285204015

CG_all_sum_promoters 

[1] 7375038

cgContent_all_sum_promoters

[1] 0.4056233

OE_all_sum_promoters

[1] 0.6286713

#Porcentaje/Ratio de GCs en el cuerpo segun el span del promoters
CG_all_sum_promoters/Width_all_sum_promoters

[1] 0.02585882

#Porcentaje/Ratio de GC del cuerpo en comparacion al genoma
genomeCG_totalcount = 45535149
CG_all_sum_promoters/45535149

[1] 0.1619636